import time
from itertools import islice
import sys

from gensim.corpora.dictionary import Dictionary

from ..utils import filter_words


def doc_generator(corpus_fp, do_filter=True):
    """Stream tokenized documents from a text corpus, one document per line.

    Each line of the file is split on whitespace into a list of tokens.
    Lines that end up with no tokens (blank lines, or lines emptied by
    filtering) are skipped and do not count towards the progress counter.

    Args:
        corpus_fp: Path of the corpus file; it is opened as UTF-8 text.
        do_filter: When true, each token list is passed through
            ``filter_words`` (imported from ``..utils``) and the result is
            what gets yielded.

    Yields:
        The token list of each non-empty document, in file order.

    Side effects:
        After every 1000 yielded documents a progress line (document count
        and elapsed seconds) is rewritten in place on stdout.
    """
    start = time.perf_counter()
    counter = 0
    with open(corpus_fp, encoding='utf-8') as fi:
        # Iterating the file object is already buffered, so there is no need
        # to materialise batches of lines in an intermediate list.
        for line in fi:
            # split() without arguments splits on runs of whitespace and
            # never produces empty or whitespace-padded tokens.
            words = line.split()
            if do_filter:
                words = filter_words(words)
            # Explicit length check kept on purpose: the return type of
            # filter_words is not visible here, so truthiness is not assumed.
            if len(words) == 0:
                continue
            counter += 1
            if counter % 1000 == 0:
                # The message has no newline, so it must be flushed explicitly
                # or a line-buffered stdout would hold it back.
                print(
                    '\r%d/%fs..' % (counter, time.perf_counter() - start),
                    end='',
                    flush=True,
                )
            yield words


def run():
    """Build a gensim ``Dictionary`` from a corpus file and save it to disk.

    Command-line arguments:
        sys.argv[2]: Path of the corpus file, read via ``doc_generator``.
        sys.argv[3]: Path the pruned dictionary is saved to.

    NOTE(review): sys.argv[1] is not read here; presumably it is consumed by
    the caller (e.g. as a sub-command name) -- confirm against the entry point.

    Raises:
        IndexError: If either argument is missing. Both are read before any
            corpus processing starts, so a bad invocation fails immediately
            rather than after the full pass over the corpus.
    """
    # Read both paths up front so a missing output path is detected before
    # the expensive dictionary build.
    corpus_fp = sys.argv[2]
    save_fp = sys.argv[3]

    dictionary = Dictionary(doc_generator(corpus_fp))
    # Per the gensim documentation: drop tokens appearing in fewer than 5
    # documents or in more than 50% of documents, then keep at most the
    # 300000 most frequent remaining tokens.
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=300000)
    dictionary.save(save_fp)
